# -*- coding: utf-8 -*-
# Python 3.8.3
"""
博尔州立大学艺术馆中国藏品爬虫
http://ballstate.dom5183.com:8080/objects-1/info/7525?query=mfs%20any%20%22Chinese%22&sort=9
"""
import json
from urllib.parse import urljoin

import requests
from lxml import etree

# URL of the first detail page; every following page is reached through the
# "Next" link of the page before it.
start_url = "http://ballstate.dom5183.com:8080/objects-1/info/7525?query=mfs%20any%20%22Chinese%22&sort=9"

# Site root against which the hrefs scraped from the pages are resolved.
BASE_URL = "http://ballstate.dom5183.com:8080/"

# Upper bound on the number of detail pages visited in one run.
MAX_PAGES = 88

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30

# Absolute XPath of the container holding the image link (div[1]) and the
# descriptive fields (div[2]) of a detail page.
DETAIL_XPATH = "/html/body/main/div[2]/div/div[1]"

# XPath selecting the href of the link whose text contains "Next".
NEXT_LINK_XPATH = '//a[contains(text(),"Next")]/@href'


def absolute_url(href):
    """Resolve *href* against the site root and return an absolute URL.

    ``urljoin`` accepts both ``/path`` and ``path`` forms, so the result
    never contains a doubled or a missing slash after the host.
    """
    return urljoin(BASE_URL, str(href).strip())


def first_text(html, path, index=0):
    """Return the stripped ``index``-th result of XPath *path*, or ``None``.

    ``None`` is returned when the expression yields fewer than ``index + 1``
    results, so a field missing from one page does not abort the whole crawl.
    """
    nodes = html.xpath(path)
    if len(nodes) <= index:
        return None
    return str(nodes[index]).strip()


def build_record(url, img_url, title, medium, dimensions, credit,
                 time_period, cat, on_view):
    """Assemble the output dictionary for one object.

    Key order is significant: it determines the layout of the serialized
    entry. Fields that are not scraped from the page are stored as ``None``.
    """
    meta = {
        "object_name": None,
        "object_type": None,          # exhibition area
        "geography": "China",         # place of origin (fixed value)
        "credit": credit,             # donation / credit information
        "reign": None,                # dynasty
        "dynasty": None,
        "period": None,
        "time_period": time_period,   # period
        "dated": None,
        "date_begin": None,
        "date_end": None,
        "dimensions": dimensions,     # size specification
        "medium": medium,             # material
        "inscription": None,
        "previous_owner": None,       # previous owner
        "label": None,                # story of the people involved
        "makers": None,               # maker
        "markings": None,
        "notes": None,
        "on_view": on_view,
        "on_view_location": None,
        "provenance": None,           # provenance (story of the artifact)
        "bibliography": None,         # bibliography
        "remarks": None,
        "si_usage_statement": None,
    }
    return {
        "url": url,                   # link of the detail page itself
        "type": None,
        "title": title,               # title of the exhibit
        # NOTE(review): the key is spelled "mata" (presumably "meta"); kept
        # unchanged because consumers of the output may rely on it.
        "mata": meta,
        "img_url": {"0": img_url},
        "cat": cat,
    }


def format_entry(count, record):
    """Return the text appended to ``do.json`` for one record.

    The entry has the form ``"<count>":{...},``. The resulting file is a
    sequence of such fragments, not a complete JSON document on its own.
    """
    return '"' + str(count) + '":' + json.dumps(record) + ","


def parse_detail_page(html, page_url):
    """Extract the record and the image URL from a parsed detail page.

    Returns a ``(record, img_url)`` tuple; ``img_url`` is ``None`` when the
    page has no image link.
    """
    info = DETAIL_XPATH + "/div[2]"
    href = first_text(html, DETAIL_XPATH + "/div[1]/a/@href")
    img_url = absolute_url(href) if href is not None else None
    record = build_record(
        url=page_url,
        img_url=img_url,
        title=first_text(html, info + "/h4/text()"),
        medium=first_text(html, info + "/div[1]/text()"),
        dimensions=first_text(html, info + "/div[2]/text()"),
        # credit and cat are read from the second text node of their div.
        credit=first_text(html, info + "/div[3]/text()", 1),
        time_period=first_text(html, info + "/div[6]/text()"),
        cat=first_text(html, info + "/div[7]/text()", 1),
        on_view=first_text(html, info + "/div[8]/text()"),
    )
    return record, img_url


def main():
    """Walk the detail pages from ``start_url`` and append the scraped data.

    For each page one entry is appended to ``do.json`` and the image URL (if
    any) to ``img_url_do.txt``. The walk stops after ``MAX_PAGES`` pages or
    as soon as a page has no "Next" link.

    Raises:
        requests.RequestException: on timeout, connection failure or an
            HTTP error status.
    """
    current_url = start_url
    for count in range(1, MAX_PAGES + 1):
        resp = requests.get(current_url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        html = etree.HTML(resp.text)

        record, img_url = parse_detail_page(html, current_url)

        # Files are reopened in append mode per page so that everything
        # scraped so far is on disk if a later page fails.
        with open("do.json", encoding="utf-8", mode="a") as file:
            file.write(format_entry(count, record))
        if img_url is not None:
            with open("img_url_do.txt", encoding="utf-8", mode="a") as f:
                f.write(img_url + "\n")

        # Follow the link to the next page; stop cleanly when there is none.
        next_href = first_text(html, NEXT_LINK_XPATH)
        if next_href is None:
            break
        current_url = absolute_url(next_href)


if __name__ == "__main__":
    main()
